# Import libraries
library(ggplot2)
library(dplyr)
library(lattice)
library(tidyverse)
library(lubridate)
library(fpp3)
library(fable)
library(fabletools)
library(feasts)
library(corrplot)
library(forecast)
library(tsibble)
library(zoo)
library(future)
library(tseries)
library(bookdown)
library(Metrics)
library(wavelets)
library(randomForest)
library(xgboost)
library(caret)
library(fable)
This report provides a detailed analysis of predicting electricity demand in New South Wales (NSW) using advanced statistical and modeling approaches to enhance precision. It examines data from January 2010 to March 2021, featuring half-hourly figures on electricity usage and temperature. The objective is to develop and refine models for better forecasting accuracy.
# Since the original forecastdemand file is too large, it is divided into 5 parts
# Since the original forecastdemand file is too large, it is divided into 5 parts.
# Read each zipped part and combine them into a single dataframe.
# unzip() extracts the named CSV into a temp dir and returns the extracted
# path(s); [1] selects the path of the single extracted file.
# (Replaces five copy-pasted read_csv() calls; the intermediate
# forecastDemand1..5 objects were only ever used in the bind_rows() call.)
readForecastPart <- function(partNum) {
  zipPath <- sprintf("../data/NSW/forecastdemand/forecastdemand_part%d.csv.zip", partNum)
  csvName <- sprintf("forecastdemand_part%d.csv", partNum)
  read_csv(unzip(zipPath, files = csvName, exdir = tempdir())[1])
}
dfForecastDemand <- bind_rows(lapply(seq_len(5), readForecastPart))
## Reference: https://www.michaelplazzer.com/datasets/australian-public-holiday-data/
# Load the Australian public-holiday dataset and keep only the NSW holidays.
dfHoliday <- read_csv('../data/Aus_public_hols_2009-2022-1.csv') %>%
  filter(State == 'NSW')
# Report whether a dataframe contains any NA values.
# Prints "True" followed by the NA count when missing values exist,
# otherwise prints "False" and a confirmation line. Called for its
# console side effect only.
countNA <- function(dataFrame) {
  # Total count of missing cells across the whole dataframe
  naTotal <- sum(is.na(dataFrame))
  if (naTotal == 0) {
    cat("False\n")
    cat("No NA values present.\n")
  } else {
    cat("True\n")
    cat("Total number of NA values:", naTotal, "\n")
  }
}
# Verify that none of the four source dataframes contain missing values.
# (dfDemand and dfTemp are loaded in an earlier chunk not shown in this excerpt.)
countNA(dfDemand)
## False
## No NA values present.
countNA(dfTemp)
## False
## No NA values present.
countNA(dfForecastDemand)
## False
## No NA values present.
countNA(dfHoliday)
## False
## No NA values present.
# Check for duplicates
duplicates <- duplicated(dfTemp)
# View the rows that are duplicates
# 13 duplicated temperature rows exist, mostly at New Year midnights
# (likely daylight-saving/rollover artifacts - TODO confirm with the source data).
dfTemp[duplicates, ]
## # A tibble: 13 × 3
## LOCATION DATETIME TEMPERATURE
## <chr> <chr> <dbl>
## 1 Bankstown 1/1/2011 0:00 21
## 2 Bankstown 10/10/2011 10:30 18.9
## 3 Bankstown 10/10/2011 18:30 16.1
## 4 Bankstown 10/10/2011 19:30 15.5
## 5 Bankstown 1/1/2012 0:00 15.4
## 6 Bankstown 1/1/2013 0:00 21
## 7 Bankstown 1/1/2014 0:00 20.4
## 8 Bankstown 1/1/2015 0:00 20.9
## 9 Bankstown 1/1/2016 0:00 16.9
## 10 Bankstown 1/1/2017 0:00 22.6
## 11 Bankstown 1/1/2018 0:00 22.4
## 12 Bankstown 1/1/2019 0:00 22.3
## 13 Bankstown 1/1/2020 0:00 19.4
# Overview of the raw demand data (DATETIME still a character column here)
summary(dfDemand)
## DATETIME TOTALDEMAND REGIONID
## Length:196513 Min. : 5075 Length:196513
## Class :character 1st Qu.: 7150 Class :character
## Mode :character Median : 8053 Mode :character
## Mean : 8113
## 3rd Qu.: 8959
## Max. :14580
# Parse a day/month/year hour:minute character column into POSIXct.
#
# Args:
#   df:          dataframe containing the datetime column.
#   dateTimeCol: name of the column to convert (default "DATETIME").
#   timeZone:    timezone applied to the parsed values. Defaults to
#                "Australia/Brisbane" - presumably chosen because Queensland
#                observes no daylight saving, avoiding invalid/ambiguous
#                local times in the half-hourly series (TODO confirm).
#
# Returns: df with dateTimeCol replaced by a POSIXct column.
convertDateTime <- function(df, dateTimeCol = "DATETIME", timeZone = "Australia/Brisbane") {
  # Check for the presence of the specified datetime column
  if (!dateTimeCol %in% names(df)) {
    stop(paste(dateTimeCol, "column not found in the dataframe."))
  }
  wasNA <- is.na(df[[dateTimeCol]])
  # Convert the datetime column to a datetime object (d/m/y h:m format)
  df[[dateTimeCol]] <- dmy_hm(df[[dateTimeCol]], tz = timeZone)
  # dmy_hm() silently turns unparseable strings into NA; surface that so bad
  # rows do not slip through the later NA checks unnoticed.
  failedCount <- sum(is.na(df[[dateTimeCol]]) & !wasNA)
  if (failedCount > 0) {
    warning(failedCount, " value(s) in '", dateTimeCol,
            "' could not be parsed and were set to NA.", call. = FALSE)
  }
  return(df)
}
# Derive calendar/time-of-day attributes from a POSIXct datetime column.
# Adds: yearValue, monthValue, dayValue, hourValue, minuteValue,
# timeOfDay (fractional hours), weekOfMonth (factor with levels 1-6)
# and dayOfWeek (factor Sun-Sat). Idempotent: re-running simply
# recomputes the same columns.
addTimeAttr <- function(df, dateTimeCol = "DATETIME") {
# Check if the datetime column exists and is of POSIXct type
if (!dateTimeCol %in% names(df) || !inherits(df[[dateTimeCol]], "POSIXct")) {
stop(paste(dateTimeCol, "column is not properly formatted as datetime in the dataframe."))
}
# Add derived time attributes
df <- df %>%
mutate(
yearValue = year(.data[[dateTimeCol]]),
monthValue = month(.data[[dateTimeCol]]),
dayValue = day(.data[[dateTimeCol]]),
hourValue = hour(.data[[dateTimeCol]]),
minuteValue = minute(.data[[dateTimeCol]]),
# Fractional hour of day, e.g. 14:30 -> 14.5
timeOfDay = hour(.data[[dateTimeCol]]) + minute(.data[[dateTimeCol]]) / 60,
# Week-of-year offset from the week containing the 1st of the month;
# a month can span up to 6 distinct week numbers, hence levels 1:6
weekOfMonth = factor(
week(.data[[dateTimeCol]]) - week(floor_date(.data[[dateTimeCol]], "month")) + 1,
levels = 1:6 # Levels for weeks in a month
),
# NOTE(review): re-levelling assumes English locale abbreviations
# ("Sun", "Mon", ...); under a non-English locale wday(label = TRUE)
# would produce different labels and this factor() would yield NAs.
dayOfWeek = factor(
wday(.data[[dateTimeCol]], label = TRUE),
levels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
)
)
return(df)
}
# Parse dfDemand's DATETIME strings into POSIXct
dfDemand <- convertDateTime(dfDemand)
head(dfDemand)
## # A tibble: 6 × 3
## DATETIME TOTALDEMAND REGIONID
## <dttm> <dbl> <chr>
## 1 2010-01-01 00:00:00 8038 NSW1
## 2 2010-01-01 00:30:00 7809. NSW1
## 3 2010-01-01 01:00:00 7484. NSW1
## 4 2010-01-01 01:30:00 7117. NSW1
## 5 2010-01-01 02:00:00 6812. NSW1
## 6 2010-01-01 02:30:00 6544. NSW1
# Add derived calendar attributes.
# (The original script called addTimeAttr() twice in a row; the second call
# only recomputed identical columns, so a single call is sufficient.)
dfDemand <- addTimeAttr(dfDemand)
head(dfDemand)
## # A tibble: 6 × 11
## DATETIME TOTALDEMAND REGIONID yearValue monthValue dayValue
## <dttm> <dbl> <chr> <dbl> <dbl> <int>
## 1 2010-01-01 00:00:00 8038 NSW1 2010 1 1
## 2 2010-01-01 00:30:00 7809. NSW1 2010 1 1
## 3 2010-01-01 01:00:00 7484. NSW1 2010 1 1
## 4 2010-01-01 01:30:00 7117. NSW1 2010 1 1
## 5 2010-01-01 02:00:00 6812. NSW1 2010 1 1
## 6 2010-01-01 02:30:00 6544. NSW1 2010 1 1
## # ℹ 5 more variables: hourValue <int>, minuteValue <int>, timeOfDay <dbl>,
## # weekOfMonth <fct>, dayOfWeek <ord>
# Drop REGIONID - it appears constant ("NSW1" in the preview) in this
# NSW-only extract, so it carries no information.
dfDemand <- dfDemand %>%
select(-'REGIONID')
head(dfDemand)
## # A tibble: 6 × 10
## DATETIME TOTALDEMAND yearValue monthValue dayValue hourValue
## <dttm> <dbl> <dbl> <dbl> <int> <int>
## 1 2010-01-01 00:00:00 8038 2010 1 1 0
## 2 2010-01-01 00:30:00 7809. 2010 1 1 0
## 3 2010-01-01 01:00:00 7484. 2010 1 1 1
## 4 2010-01-01 01:30:00 7117. 2010 1 1 1
## 5 2010-01-01 02:00:00 6812. 2010 1 1 2
## 6 2010-01-01 02:30:00 6544. 2010 1 1 2
## # ℹ 4 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## # dayOfWeek <ord>
# Calculate sequence of years for demarcation
yearSeq <- seq(from = floor_date(min(dfDemand$DATETIME), "year"),
to = ceiling_date(max(dfDemand$DATETIME), "year"),
by = "1 year")
# Enhanced visual plot using ggplot2
# Line plot of the half-hourly demand with dashed red guides at each
# year boundary.
dfDemand %>%
ggplot(aes(x = DATETIME, y = TOTALDEMAND)) +
geom_line(color = "deepskyblue4", linewidth = 1) +
geom_vline(data = data.frame(x = yearSeq),
aes(xintercept = as.numeric(x)),
linetype = "dashed", color = "firebrick1") +
scale_x_datetime(date_breaks = '1 year', date_labels = '%Y') +
labs(x = "Year", y = "Electricity Demand",
title = "Yearly Electricity Demand in NSW") +
theme_light(base_size = 10) +
theme(plot.title = element_text(hjust = 0.5, face="bold"),
axis.title.x = element_text(face="italic"),
axis.title.y = element_text(face="italic"))
# Create a new column 'Season' according to the month
# (Southern-hemisphere mapping: Dec-Feb = Summer, Jun-Aug = Winter, etc.)
dfDemandSeason <- dfDemand %>%
mutate(Season = case_when(
monthValue %in% c(9, 10, 11) ~ "Spring",
monthValue %in% c(12, 1, 2) ~ "Summer",
monthValue %in% c(3, 4, 5) ~ "Autumn",
monthValue %in% c(6, 7, 8) ~ "Winter",
TRUE ~ NA_character_))
# Treat the hour as a factor so bwplot draws one box per hour
dfDemandSeason$hourValue <- factor(dfDemandSeason$hourValue, levels = seq(0, 23))
# Plot with adjusted y-axis labels
# Lattice box-and-whisker plot: hourly demand distribution, one panel per season
bwplot(TOTALDEMAND ~ hourValue | Season, data = dfDemandSeason,
layout = c(2, 2),
xlab = "Hour of the Day", ylab = "Total Demand",
main = "Hourly Electricity Demand by Season",
scales = list(
x = list( cex = 0.5),
y = list(rot = 90, cex = 0.7)
))
# Overview of the raw temperature data (DATETIME still a character column)
summary(dfTemp)
## LOCATION DATETIME TEMPERATURE
## Length:220326 Length:220326 Min. :-1.30
## Class :character Class :character 1st Qu.:13.40
## Mode :character Mode :character Median :17.70
## Mean :17.42
## 3rd Qu.:21.30
## Max. :44.70
# Remove 'LOCATION' column from dfTemp
dfTemp <- dfTemp %>% select(-'LOCATION')
head(dfTemp)
## # A tibble: 6 × 2
## DATETIME TEMPERATURE
## <chr> <dbl>
## 1 1/1/2010 0:00 23.1
## 2 1/1/2010 0:01 23.1
## 3 1/1/2010 0:30 22.9
## 4 1/1/2010 0:50 22.7
## 5 1/1/2010 1:00 22.6
## 6 1/1/2010 1:30 22.5
# Parse DATETIME strings into POSIXct (same helper as for dfDemand)
dfTemp <- convertDateTime(dfTemp)
head(dfTemp)
## # A tibble: 6 × 2
## DATETIME TEMPERATURE
## <dttm> <dbl>
## 1 2010-01-01 00:00:00 23.1
## 2 2010-01-01 00:01:00 23.1
## 3 2010-01-01 00:30:00 22.9
## 4 2010-01-01 00:50:00 22.7
## 5 2010-01-01 01:00:00 22.6
## 6 2010-01-01 01:30:00 22.5
# Add derived calendar attributes (the doubled parentheses are harmless)
dfTemp <- addTimeAttr((dfTemp))
head(dfTemp)
## # A tibble: 6 × 10
## DATETIME TEMPERATURE yearValue monthValue dayValue hourValue
## <dttm> <dbl> <dbl> <dbl> <int> <int>
## 1 2010-01-01 00:00:00 23.1 2010 1 1 0
## 2 2010-01-01 00:01:00 23.1 2010 1 1 0
## 3 2010-01-01 00:30:00 22.9 2010 1 1 0
## 4 2010-01-01 00:50:00 22.7 2010 1 1 0
## 5 2010-01-01 01:00:00 22.6 2010 1 1 1
## 6 2010-01-01 01:30:00 22.5 2010 1 1 1
## # ℹ 4 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## # dayOfWeek <ord>
summary(dfTemp)
## DATETIME TEMPERATURE yearValue
## Min. :2010-01-01 00:00:00.00 Min. :-1.30 Min. :2010
## 1st Qu.:2012-11-02 23:07:30.00 1st Qu.:13.40 1st Qu.:2012
## Median :2015-08-16 13:45:00.00 Median :17.70 Median :2015
## Mean :2015-08-16 07:59:12.96 Mean :17.42 Mean :2015
## 3rd Qu.:2018-06-03 14:22:30.00 3rd Qu.:21.30 3rd Qu.:2018
## Max. :2021-03-18 00:00:00.00 Max. :44.70 Max. :2021
##
## monthValue dayValue hourValue minuteValue
## Min. : 1.000 Min. : 1.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 3.000 1st Qu.: 8.00 1st Qu.: 5.00 1st Qu.: 0.00
## Median : 6.000 Median :16.00 Median :11.00 Median :29.00
## Mean : 6.423 Mean :15.66 Mean :11.41 Mean :16.51
## 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:17.00 3rd Qu.:30.00
## Max. :12.000 Max. :31.00 Max. :23.00 Max. :59.00
##
## timeOfDay weekOfMonth dayOfWeek
## Min. : 0.00 1:28907 Sun:31373
## 1st Qu.: 5.50 2:51106 Mon:31301
## Median :11.50 3:50611 Tue:31796
## Mean :11.69 4:50803 Wed:31736
## 3rd Qu.:17.50 5:36736 Thu:31356
## Max. :23.98 6: 2163 Fri:31357
## Sat:31407
Since the temperature data are recorded at irregular minute intervals while the electricity demand data are recorded every 30 minutes, temperature rows whose minuteValue is neither 0 nor 30 are removed.
# Remove rows from dfTemp where minuteValue is not 0 or 30 and store the result in dfTemp
# (keeps only observations aligned with the half-hourly demand series)
dfTemp <- dfTemp %>%
filter(minuteValue == 0 | minuteValue == 30)
# Print the resulting summary to verify the rows have been filtered correctly
summary(dfTemp)
## DATETIME TEMPERATURE yearValue
## Min. :2010-01-01 00:00:00.00 Min. :-1.30 Min. :2010
## 1st Qu.:2012-10-24 01:15:00.00 1st Qu.:13.50 1st Qu.:2012
## Median :2015-08-10 17:30:00.00 Median :17.90 Median :2015
## Mean :2015-08-12 09:19:17.21 Mean :17.53 Mean :2015
## 3rd Qu.:2018-06-01 10:15:00.00 3rd Qu.:21.50 3rd Qu.:2018
## Max. :2021-03-18 00:00:00.00 Max. :44.70 Max. :2021
##
## monthValue dayValue hourValue minuteValue timeOfDay
## Min. : 1.000 Min. : 1.00 Min. : 0.0 Min. : 0 Min. : 0.00
## 1st Qu.: 3.000 1st Qu.: 8.00 1st Qu.: 6.0 1st Qu.: 0 1st Qu.: 6.00
## Median : 6.000 Median :16.00 Median :12.0 Median : 0 Median :12.00
## Mean : 6.437 Mean :15.69 Mean :11.5 Mean :15 Mean :11.75
## 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:18.0 3rd Qu.:30 3rd Qu.:18.00
## Max. :12.000 Max. :31.00 Max. :23.0 Max. :30 Max. :23.50
##
## weekOfMonth dayOfWeek
## 1:25543 Sun:28003
## 2:45289 Mon:27959
## 3:45027 Tue:28005
## 4:44897 Wed:28044
## 5:33225 Thu:27956
## 6: 1966 Fri:27992
## Sat:27988
# DATETIME is already POSIXct after convertDateTime(); this coercion is a
# no-op safeguard retained from the original script.
dfTemp$DATETIME <- as.POSIXct(dfTemp$DATETIME)
# Create a sequence of years from min to max year in 'DATETIME'
yearBreaks <- seq(from = floor_date(min(dfTemp$DATETIME), "year"),
                  to = ceiling_date(max(dfTemp$DATETIME), "year"),
                  by = "1 year")
# Determine the exact positions for x-axis labels
labelPositions <- as.numeric(yearBreaks)
# Plot using lattice with lines instead of points and adjusted x-axis
temperaturePlot <- xyplot(TEMPERATURE ~ DATETIME, data = dfTemp,
                          panel = function(x, y, ...) {
                            # Plot data as a line
                            panel.xyplot(x, y, type = "l", ...)
                            # Add dashed vertical guides at each year boundary
                            panel.abline(v = yearBreaks, col = "red", lty = 2)
                          },
                          scales = list(x = list(at = labelPositions, labels = format(yearBreaks, "%Y"),
                                                 rot = 0)), # Labels are horizontal
                          xlab = "Year",
                          ylab = "Temperature",
                          # Fixed typo in the title: "Vairation" -> "Variation"
                          main = "Yearly Temperature Variation in NSW")
# Print the plot
print(temperaturePlot)
Figure 3.1: Temperature Over Time.
The line graph showcasing temperature variations in New South Wales from 2010 to 2021 vividly illustrates a pronounced seasonal pattern, with temperature fluctuations that consistently repeat each year throughout the observed period. Each year within the dataset, temperatures in NSW ascend during the warmer months and descend during the cooler periods, showcasing a clear and predictable cycle of seasonal temperature changes. This regularity is marked by similar patterns emerging year after year, underscoring the stable and predictable nature of seasonal influences on the climate of NSW.
When dealing with multiple forecast data points for each timestamp, taking the median value is an effective method to stabilize and enhance the reliability of the data. Each timestamp may have varying forecasts due to different modeling assumptions. By calculating the median, these discrepancies are smoothed out, providing a more consistent and robust estimate of demand. This approach helps in reducing noise and ensures that the predictions are less influenced by outliers or extreme variations, making the forecast data more dependable for decision-making and analysis.
# Group the data by 'DATETIME', then calculate the median of 'FORECASTDEMAND' for each group
# Collapses the many competing forecasts per timestamp into one robust value
dfForecast <- dfForecastDemand %>%
group_by(DATETIME) %>% # Group data by the 'DATETIME' column
summarise(MedianForecastDemand = median(FORECASTDEMAND, na.rm = TRUE)) # Calculate median, removing NA values
# View the new dataframe
head(dfForecast)
## # A tibble: 6 × 2
## DATETIME MedianForecastDemand
## <dttm> <dbl>
## 1 2010-01-01 00:00:00 7821.
## 2 2010-01-01 00:30:00 7709.
## 3 2010-01-01 01:00:00 7483.
## 4 2010-01-01 01:30:00 7113.
## 5 2010-01-01 02:00:00 6776.
## 6 2010-01-01 02:30:00 6523.
summary(dfForecast)
## DATETIME MedianForecastDemand
## Min. :2010-01-01 00:00:00 Min. : 4835
## 1st Qu.:2012-10-20 12:00:00 1st Qu.: 7123
## Median :2015-08-10 00:00:00 Median : 8059
## Mean :2015-08-10 00:00:00 Mean : 8105
## 3rd Qu.:2018-05-29 12:00:00 3rd Qu.: 8967
## Max. :2021-03-18 00:00:00 Max. :14583
The ‘dfForecast’ dataset provides a summary of forecasted demand in NSW from January 1, 2010, to March 18, 2021. The date distribution is centered around August 10, 2015, indicating a balanced spread over the analyzed period. Forecasted demand ranges from 4,835 to 14,583, with a median of 8,059 and a mean very close at 8,105, suggesting a relatively symmetric distribution around the central values. The quartile figures, with the first quartile at 7,123 and the third at 8,967, highlight the variability in forecasted demands. This data is crucial for understanding trends and making informed decisions in resource management and planning based on anticipated demand.
# Preview both dataframes before flagging holidays
head(dfDemand)
## # A tibble: 6 × 10
## DATETIME TOTALDEMAND yearValue monthValue dayValue hourValue
## <dttm> <dbl> <dbl> <dbl> <int> <int>
## 1 2010-01-01 00:00:00 8038 2010 1 1 0
## 2 2010-01-01 00:30:00 7809. 2010 1 1 0
## 3 2010-01-01 01:00:00 7484. 2010 1 1 1
## 4 2010-01-01 01:30:00 7117. 2010 1 1 1
## 5 2010-01-01 02:00:00 6812. 2010 1 1 2
## 6 2010-01-01 02:30:00 6544. 2010 1 1 2
## # ℹ 4 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## # dayOfWeek <ord>
head(dfHoliday)
## # A tibble: 6 × 5
## Date State Weekday Day_In_Lieu Part_Day_From
## <date> <chr> <chr> <dbl> <dbl>
## 1 2009-01-01 NSW Thursday 0 0
## 2 2009-01-26 NSW Monday 0 0
## 3 2009-04-10 NSW Friday 0 0
## 4 2009-04-11 NSW Saturday 0 0
## 5 2009-04-12 NSW Sunday 0 0
## 6 2009-04-13 NSW Monday 0 0
# Flag each half-hourly record as falling on a NSW public holiday or not,
# and add a plain Date column for later date-level operations.
dfDemand <- dfDemand %>%
  mutate(
    IsHoliday = if_else(date(DATETIME) %in% dfHoliday$Date, 'Holiday', 'Non-Holiday'),
    Date = as.Date(DATETIME)
  )
# Create a violin plot to compare electricity demand on holidays vs non-holidays
ggplot(dfDemand, aes(x = IsHoliday, y = TOTALDEMAND, fill = IsHoliday)) +
  geom_violin(trim = FALSE) +
  scale_fill_manual(values = c("Holiday" = "red", "Non-Holiday" = "blue")) +
  labs(title = "Electricity Demand Comparison on Holidays vs Non-Holidays",
       x = "", y = "Total Demand (MW)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
The violin plot illustrates the differences in electricity demand between holidays and non-holidays. It shows that the median demand on holidays tends to be lower compared to non-holidays. Furthermore, this plot provides insight into the overall distribution and highlights several outliers that fall outside the common range for both holiday and non-holiday periods.
head(dfDemand)
## # A tibble: 6 × 12
## DATETIME TOTALDEMAND yearValue monthValue dayValue hourValue
## <dttm> <dbl> <dbl> <dbl> <int> <int>
## 1 2010-01-01 00:00:00 8038 2010 1 1 0
## 2 2010-01-01 00:30:00 7809. 2010 1 1 0
## 3 2010-01-01 01:00:00 7484. 2010 1 1 1
## 4 2010-01-01 01:30:00 7117. 2010 1 1 1
## 5 2010-01-01 02:00:00 6812. 2010 1 1 2
## 6 2010-01-01 02:30:00 6544. 2010 1 1 2
## # ℹ 6 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## # dayOfWeek <ord>, IsHoliday <chr>, Date <date>
# Merge the dfDemand and dfTemp based on 'DATETIME'
# (inner join keeps only timestamps present in both series)
dfDT <- inner_join(dfDemand, dfTemp, by = "DATETIME")
# Calculate Pearson's correlation coefficient
corrResult <- cor(dfDT$TOTALDEMAND, dfDT$TEMPERATURE, method = "pearson")
# Print the correlation coefficient
print(corrResult)
## [1] 0.1490452
# Perform a significance test (H0: true correlation equals 0)
corrTestResult <- cor.test(dfDT$TOTALDEMAND, dfDT$TEMPERATURE, method = "pearson")
print(corrTestResult)
##
## Pearson's product-moment correlation
##
## data: dfDT$TOTALDEMAND and dfDT$TEMPERATURE
## t = 66.721, df = 195945, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1447130 0.1533717
## sample estimates:
## cor
## 0.1490452
Using Pearson’s correlation to assess the relationship between electricity demand and temperature in NSW is well-suited due to both variables being continuous and the assumption that they share a linear relationship.
The Pearson correlation analysis between electricity demand (‘TOTALDEMAND’) and temperature (‘TEMPERATURE’) using the dataset ‘dfMerged’ produced a correlation coefficient (‘cor’) of 0.149. This positive correlation indicates that as the temperature increases, there is a slight upward trend in electricity demand. The results of the Pearson’s test were highly statistically significant, as indicated by an extremely small p-value (less than 2.2e-16), strongly rejecting the null hypothesis that there is no correlation between these two variables.
The confidence interval for this correlation, ranging from 0.1447 to 0.1534, is narrow, suggesting a stable estimate of the correlation coefficient. This interval underscores that, while the correlation is relatively modest, it is consistently different from zero across different samples from the population.
In conclusion, the analysis confirms a statistically significant positive relationship between temperature and electricity demand. This suggests that higher temperatures may lead to increased electricity usage, possibly due to greater use of cooling systems or other temperature-dependent activities.
# Inner join dfDemand & dfForecast
dfDF <- inner_join(dfDemand, dfForecast, by = "DATETIME")
head(dfDF)
## # A tibble: 6 × 13
## DATETIME TOTALDEMAND yearValue monthValue dayValue hourValue
## <dttm> <dbl> <dbl> <dbl> <int> <int>
## 1 2010-01-01 10:00:00 8067. 2010 1 1 10
## 2 2010-01-01 10:30:00 8123. 2010 1 1 10
## 3 2010-01-01 11:00:00 8196. 2010 1 1 11
## 4 2010-01-01 11:30:00 8221. 2010 1 1 11
## 5 2010-01-01 12:00:00 8278. 2010 1 1 12
## 6 2010-01-01 12:30:00 8362. 2010 1 1 12
## # ℹ 7 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## # dayOfWeek <ord>, IsHoliday <chr>, Date <date>, MedianForecastDemand <dbl>
# Select the data from 2015 to 2018 from dfDF
dfDF2015_2018 <- dfDF %>%
filter(yearValue >= 2015 & yearValue <= 2018)
# Plot using ggplot2 with smoothed lines using dfFD
# LOESS smoothing emphasises the long-run trend of forecast vs actual demand
plot <- ggplot(dfDF2015_2018, aes(x = DATETIME)) +
geom_smooth(aes(y = MedianForecastDemand, colour = "Median Forecast Demand"),
method = "loess", se = FALSE, linewidth = 1) +
geom_smooth(aes(y = TOTALDEMAND, colour = "Total Demand"),
method = "loess", se = FALSE, linewidth = 1) +
labs(title = "Smoothed Comparison of Median Forecast Demand and Actual Demand from 2015 to 2018",
x = "Year",
y = "Demand") +
scale_colour_manual(values = c("Median Forecast Demand" = "blue", "Total Demand" = "red")) +
theme_minimal()
# Print the plot
print(plot)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Calculate MAE (mae()/rmse() come from the Metrics package loaded above)
maeValue <- mae(dfDF$TOTALDEMAND, dfDF$MedianForecastDemand)
print(paste("Mean Absolute Error (MAE):", maeValue))
## [1] "Mean Absolute Error (MAE): 1412.68754332215"
# Calculate RMSE
rmseValue <- rmse(dfDF$TOTALDEMAND, dfDF$MedianForecastDemand)
print(paste("Root Mean Squared Error (RMSE):", rmseValue))
## [1] "Root Mean Squared Error (RMSE): 1774.70285666052"
MAE = 1412.6: This value suggests that on average, the predictions of the forecasting model deviate from the actual demand by about 1412.6 units.
RMSE = 1774.7: The value of 1774.7 indicates that while many of the predictions may be reasonably accurate, there are notable instances where the forecast deviates significantly from the actual demand. This could be indicative of the model’s sensitivity to data anomalies or its failure to capture extreme fluctuations due to external factors or rare events.
# Root-mean-squared error between actual and predicted values,
# ignoring any NA error terms.
calculateRMSE <- function(actual, predicted) {
  squaredErrors <- (predicted - actual)^2
  sqrt(mean(squaredErrors, na.rm = TRUE))
}
# Search for a systematic time shift between the forecast and actual demand:
# try delays from -24 to +24 hours and measure the RMSE at each shift.
delays <- seq(-24, 24, by = 1)
# Preallocate the results instead of growing the data frame with rbind()
# inside the loop (rbind copies the whole frame on every iteration).
results <- data.frame(Delay = delays, RMSE = NA_real_)
for (i in seq_along(delays)) {
  # Shift the forecast timestamps by delays[i] hours
  dfForecastShifted <- dfForecast %>%
    mutate(DATETIME = DATETIME + hours(delays[i]))
  # Merge with actual demand data.
  # NOTE: dfComparison is deliberately assigned at script level - the value
  # from the final iteration is reused further below in the error analysis.
  dfComparison <- inner_join(dfForecastShifted, dfDemand, by = "DATETIME")
  # RMSE for this candidate delay. (Avoid a local named 'rmse', which would
  # shadow Metrics::rmse used elsewhere in the script.)
  results$RMSE[i] <- calculateRMSE(dfComparison$TOTALDEMAND, dfComparison$MedianForecastDemand)
}
# Identify the delay with the minimum RMSE
optimal_delay <- results[which.min(results$RMSE), ]
print(optimal_delay)
## Delay RMSE
## 15 -10 230.4831
The optimal delay is -10 hours: shifting the forecast timestamps back by 10 hours minimises the RMSE (approximately 230.5) against the actual demand.
# Per-timestamp error components for the forecast/actual comparison left
# over from the final iteration of the delay loop above.
dfError <- dfComparison %>%
mutate(
AbsoluteError = abs(TOTALDEMAND - MedianForecastDemand), # Mean Absolute Error component
SquaredError = (TOTALDEMAND - MedianForecastDemand)^2 # Mean Squared Error component
)
# Plotting Absolute Errors over Time
ggplot(dfError, aes(x = DATETIME, y = AbsoluteError)) +
geom_line() +
labs(title = "Absolute Errors Over Time", x = "Year", y = "Absolute Error") +
theme_minimal()
# Plotting Squared Errors over Time
ggplot(dfError, aes(x = DATETIME, y = SquaredError)) +
geom_line() +
labs(title = "Squared Errors Over Time", x = "Datetime", y = "Squared Error") +
theme_minimal()
# IQR-based (Tukey fence) outlier bounds on the absolute error.
# NOTE(review): IQR/UpperBound/LowerBound/Outlier computed here are
# recomputed and overwritten by the very next mutate(), so this first pass
# is effectively dead code - confirm whether it can be removed.
dfErrorOutlier <- dfError %>%
mutate(
IQR = IQR(AbsoluteError, na.rm = TRUE), # Calculate IQR
UpperBound = quantile(AbsoluteError, 0.75, na.rm = TRUE) + 1.5 * IQR,
LowerBound = quantile(AbsoluteError, 0.25, na.rm = TRUE) - 1.5 * IQR,
Outlier = AbsoluteError > UpperBound | AbsoluteError < LowerBound # Identify outliers
)
# Create groups of consecutive outliers
dfErrorOutlier <- dfErrorOutlier %>%
mutate(
# NOTE(review): despite its name, this overwrites SquaredError with the
# ABSOLUTE error; the later "Squared Errors" plot therefore actually shows
# absolute errors - confirm intent.
SquaredError = abs(TOTALDEMAND - MedianForecastDemand),
Outlier = SquaredError > (quantile(SquaredError, 0.75, na.rm = TRUE) + 1.5 * IQR(SquaredError, na.rm = TRUE))
) %>%
arrange(DATETIME) %>%
mutate(
# A new group starts whenever the outlier flag flips
GroupChange = Outlier != lag(Outlier, default = first(Outlier)),
Group = cumsum(GroupChange)
)
# Getting intervals for outliers
# Start/end timestamps of each consecutive run of outliers
outlierInterval <- dfErrorOutlier %>%
filter(Outlier) %>%
group_by(Group) %>%
summarise(
Start = min(DATETIME),
End = max(DATETIME)
) %>%
ungroup()
# Plotting Squared Errors over Time with Outlier intervals highlighted
plot <- ggplot() +
  # Shade the time intervals that contain consecutive outliers
  geom_rect(data = outlierInterval, aes(xmin = Start, xmax = End, ymin = -Inf, ymax = Inf), fill = "red", alpha = 0.2) +
  geom_line(data = dfErrorOutlier, aes(x = DATETIME, y = SquaredError), color = "gray", alpha = 0.5) +
  # Fixed: points take the 'size' aesthetic; 'linewidth' applies only to
  # line-based geoms and was previously ignored with a warning.
  geom_point(data = dfErrorOutlier, aes(x = DATETIME, y = SquaredError, color = Outlier), size = 1, alpha = 0.8) +
  scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
  labs(title = "Squared Errors Over Time with Outliers Highlighted") +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold") # Center and bold the title
  ) +
  # Dashed blue vertical guides at each year boundary
  geom_vline(data = data.frame(x = seq(from = floor_date(min(dfErrorOutlier$DATETIME), "year"),
                                       to = ceiling_date(max(dfErrorOutlier$DATETIME), "year"),
                                       by = "1 year")),
             aes(xintercept = as.numeric(x)), linetype = "dashed", color = "blue") +
  scale_x_datetime(breaks = seq(from = floor_date(min(dfErrorOutlier$DATETIME), "year"),
                                to = ceiling_date(max(dfErrorOutlier$DATETIME), "year"),
                                by = "1 year"),
                   labels = scales::date_format("%Y")) # Formatting labels to show only the year part
print(plot)
# Convert the dataframe to a tsibble object
# (tsibble requires a unique, ordered time index - here DATETIME)
dfDemandTS <- dfDemand %>%
select(DATETIME, TOTALDEMAND) %>%
as_tsibble(index = DATETIME)
# STL decomposition into trend + seasonal + remainder; season(window =
# "periodic") forces an identical seasonal pattern each cycle, and
# robust = TRUE downweights outliers when fitting.
decomposed <- dfDemandTS %>%
model(STL = STL(TOTALDEMAND ~ season(window = "periodic"),
robust = TRUE)) %>%
components()
decomposed %>% autoplot()
# Perform a discrete wavelet transform (DWT)
totalDemand <- as.numeric(dfDemand$TOTALDEMAND)
# Perform a discrete wavelet transform (DWT)
# Choose 'haar' as the wavelet filter and specify the number of decomposition levels (n.levels)
# dwt()/idwt() come from the 'wavelets' package loaded at the top of the script
dwtResult <- dwt(totalDemand, filter = "haar", n.levels = 4, boundary = "periodic")
# Reconstruct the time series from wavelet coefficients
reconstructed <- idwt(dwtResult)
# Create a data frame for plotting
plotData <- data.frame(Index = seq_along(totalDemand),
Original = totalDemand,
Reconstructed = reconstructed)
# Plot
# Overlay the original (blue) and wavelet-reconstructed (red) series
ggplot(plotData) +
geom_point(aes(x = Index, y = Original), color = "blue", alpha = 0.5) +
geom_point(aes(x = Index, y = Reconstructed), color = "red", alpha = 0.5) +
labs(title = "Original vs Reconstructed Series",
x = "Index",
y = "Value") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
head(dfDemand)
## # A tibble: 6 × 12
## DATETIME TOTALDEMAND yearValue monthValue dayValue hourValue
## <dttm> <dbl> <dbl> <dbl> <int> <int>
## 1 2010-01-01 00:00:00 8038 2010 1 1 0
## 2 2010-01-01 00:30:00 7809. 2010 1 1 0
## 3 2010-01-01 01:00:00 7484. 2010 1 1 1
## 4 2010-01-01 01:30:00 7117. 2010 1 1 1
## 5 2010-01-01 02:00:00 6812. 2010 1 1 2
## 6 2010-01-01 02:30:00 6544. 2010 1 1 2
## # ℹ 6 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## # dayOfWeek <ord>, IsHoliday <chr>, Date <date>
# Recode IsHoliday from "Holiday"/"Non-Holiday" to numeric 1/0 so it can
# enter the correlation matrix below
dfDemand <- dfDemand %>%
mutate(IsHoliday = ifelse(IsHoliday == "Holiday", 1, 0))
head(dfDemand)
## # A tibble: 6 × 12
## DATETIME TOTALDEMAND yearValue monthValue dayValue hourValue
## <dttm> <dbl> <dbl> <dbl> <int> <int>
## 1 2010-01-01 00:00:00 8038 2010 1 1 0
## 2 2010-01-01 00:30:00 7809. 2010 1 1 0
## 3 2010-01-01 01:00:00 7484. 2010 1 1 1
## 4 2010-01-01 01:30:00 7117. 2010 1 1 1
## 5 2010-01-01 02:00:00 6812. 2010 1 1 2
## 6 2010-01-01 02:30:00 6544. 2010 1 1 2
## # ℹ 6 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## # dayOfWeek <ord>, IsHoliday <dbl>, Date <date>
# Numeric-only copy for the correlation matrix; factors (weekOfMonth,
# dayOfWeek) are coerced to their integer codes by as.numeric()
dfCorHM <- dfDemand %>%
select(-DATETIME, -Date, -timeOfDay, -minuteValue) %>%
mutate(across(everything(), as.numeric))
corMatrix <- cor(dfCorHM)
# Heatmap of the pairwise correlations with coefficients overlaid
corrplot(corMatrix, method = "color", type = "full", order = "hclust",
tl.col = "black", tl.srt = 45,
addCoef.col = "black")
# Extracting the correlation of each variable with TOTALDEMAND
totalDemandCor <- corMatrix["TOTALDEMAND", ]
totalDemandCor <- totalDemandCor[order(-abs(totalDemandCor))] # Sorting by absolute value
# Removing TOTALDEMAND's self-correlation
totalDemandCor <- totalDemandCor[totalDemandCor != 1]
# Viewing the ranked correlations
print(totalDemandCor)
## hourValue yearValue monthValue IsHoliday dayOfWeek weekOfMonth
## 0.476948947 -0.209345123 -0.122574678 -0.121197507 0.043732681 0.005078356
## dayValue
## 0.004951010
# Extract TOTALDEMAND for modeling
# (this overwrites the totalDemand vector used for the wavelet analysis above)
totalDemand <- dfDemandTS %>% select(TOTALDEMAND) %>% unlist()
tsData <- ts(totalDemand, frequency = 336) # 48 half-hours per day * 7 days = 336 half-hours per week
# Augmented Dickey-Fuller test; the null hypothesis is non-stationarity
adfTest <- adf.test(tsData, alternative = "stationary")
print(adfTest)
##
## Augmented Dickey-Fuller Test
##
## data: tsData
## Dickey-Fuller = -2.1127, Lag order = 73, p-value = 0.5309
## alternative hypothesis: stationary
# First difference
tsDataDiff <- diff(tsData, differences = 1)
# Perform ADF test again on the differenced series
adfTestDiff <- adf.test(tsDataDiff, alternative = "stationary")
## Warning in adf.test(tsDataDiff, alternative = "stationary"): p-value smaller
## than printed p-value
print(adfTestDiff)
##
## Augmented Dickey-Fuller Test
##
## data: tsDataDiff
## Dickey-Fuller = -72.871, Lag order = 73, p-value = 0.01
## alternative hypothesis: stationary
The results from the Augmented Dickey-Fuller (ADF) test on the differenced data (tsDataDiff) indicate that after applying first differencing, the time series has become stationary. The test statistic is significantly negative (-72.871), and the p-value is 0.01, which is well below the conventional threshold of 0.05. This suggests you can reject the null hypothesis of non-stationarity. The warning about the p-value being smaller than printed indicates that the true p-value is very small, reinforcing the conclusion that the series is stationary.
# ACF of the demand after seasonal (lag-48 = one-day) differencing,
# using the default lag window
dfDemandTS %>%
mutate(demandDiff = difference(TOTALDEMAND, 48)) %>%
ACF(demandDiff) %>%
autoplot()
The ACF of the lag-48 differenced series decays toward 0 as the lag increases; the default lag window is too short to show the weekly pattern, so the same ACF is re-plotted below with larger lag_max values.
# ACF over a full week of lags (336 half-hours) to expose weekly seasonality
dfDemandTS %>%
mutate(demandDiff = difference(TOTALDEMAND, 48)) %>%
ACF(demandDiff, lag_max = 336) %>%
autoplot()
# PACF of the seasonally differenced demand, default lag window
dfDemandTS %>%
mutate(demandDiff = difference(TOTALDEMAND, 48)) %>%
PACF(demandDiff) %>%
autoplot()
### PACF Plot for Daily and Weekly Seasonality
dfDemandTS %>%
mutate(demandDiff = difference(TOTALDEMAND, 48)) %>%
PACF(demandDiff, lag_max = 336) %>%
autoplot()
### PACF Plot for Daily and Monthly Seasonality
# 1440 half-hours = 30 days of lags
dfDemandTS %>%
mutate(demandDiff = difference(TOTALDEMAND, 48)) %>%
PACF(demandDiff, lag_max = 1440) %>%
autoplot()
# Check for duplicates
duplicates <- duplicated(dfDemand)
# View the rows that are duplicates
dfDemand[duplicates, ]
## # A tibble: 0 × 12
## # ℹ 12 variables: DATETIME <dttm>, TOTALDEMAND <dbl>, yearValue <dbl>,
## # monthValue <dbl>, dayValue <int>, hourValue <int>, minuteValue <int>,
## # timeOfDay <dbl>, weekOfMonth <fct>, dayOfWeek <ord>, IsHoliday <dbl>,
## # Date <date>
head(dfDemand)
## # A tibble: 6 × 12
## DATETIME TOTALDEMAND yearValue monthValue dayValue hourValue
## <dttm> <dbl> <dbl> <dbl> <int> <int>
## 1 2010-01-01 00:00:00 8038 2010 1 1 0
## 2 2010-01-01 00:30:00 7809. 2010 1 1 0
## 3 2010-01-01 01:00:00 7484. 2010 1 1 1
## 4 2010-01-01 01:30:00 7117. 2010 1 1 1
## 5 2010-01-01 02:00:00 6812. 2010 1 1 2
## 6 2010-01-01 02:30:00 6544. 2010 1 1 2
## # ℹ 6 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## # dayOfWeek <ord>, IsHoliday <dbl>, Date <date>
# Preview the temperature data before trimming columns
head(dfTemp)
## # A tibble: 6 × 10
## DATETIME TEMPERATURE yearValue monthValue dayValue hourValue
## <dttm> <dbl> <dbl> <dbl> <int> <int>
## 1 2010-01-01 00:00:00 23.1 2010 1 1 0
## 2 2010-01-01 00:30:00 22.9 2010 1 1 0
## 3 2010-01-01 01:00:00 22.6 2010 1 1 1
## 4 2010-01-01 01:30:00 22.5 2010 1 1 1
## 5 2010-01-01 02:00:00 22.5 2010 1 1 2
## 6 2010-01-01 02:30:00 22.4 2010 1 1 2
## # ℹ 4 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## # dayOfWeek <ord>
# Keep only the timestamp and temperature columns for the merge step
dfTemp <- select(dfTemp, DATETIME, TEMPERATURE)
head(dfTemp)
## # A tibble: 6 × 2
## DATETIME TEMPERATURE
## <dttm> <dbl>
## 1 2010-01-01 00:00:00 23.1
## 2 2010-01-01 00:30:00 22.9
## 3 2010-01-01 01:00:00 22.6
## 4 2010-01-01 01:30:00 22.5
## 5 2010-01-01 02:00:00 22.5
## 6 2010-01-01 02:30:00 22.4
# Display any fully duplicated rows in the temperature data
# (13 duplicates, mostly at New Year midnights — likely DST/turn-of-year artefacts)
dfTemp[duplicated(dfTemp), ]
## # A tibble: 13 × 2
## DATETIME TEMPERATURE
## <dttm> <dbl>
## 1 2011-01-01 00:00:00 21
## 2 2011-10-10 10:30:00 18.9
## 3 2011-10-10 18:30:00 16.1
## 4 2011-10-10 19:30:00 15.5
## 5 2012-01-01 00:00:00 15.4
## 6 2013-01-01 00:00:00 21
## 7 2014-01-01 00:00:00 20.4
## 8 2015-01-01 00:00:00 20.9
## 9 2016-01-01 00:00:00 16.9
## 10 2017-01-01 00:00:00 22.6
## 11 2018-01-01 00:00:00 22.4
## 12 2019-01-01 00:00:00 22.3
## 13 2020-01-01 00:00:00 19.4
# Attach temperature to each demand observation; a left join keeps every
# demand row even where temperature is missing
dfDT <- dfDemand %>%
  left_join(dfTemp, by = "DATETIME")
head(dfDT)
## # A tibble: 6 × 13
## DATETIME TOTALDEMAND yearValue monthValue dayValue hourValue
## <dttm> <dbl> <dbl> <dbl> <int> <int>
## 1 2010-01-01 00:00:00 8038 2010 1 1 0
## 2 2010-01-01 00:30:00 7809. 2010 1 1 0
## 3 2010-01-01 01:00:00 7484. 2010 1 1 1
## 4 2010-01-01 01:30:00 7117. 2010 1 1 1
## 5 2010-01-01 02:00:00 6812. 2010 1 1 2
## 6 2010-01-01 02:30:00 6544. 2010 1 1 2
## # ℹ 7 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## # dayOfWeek <ord>, IsHoliday <dbl>, Date <date>
# Display any fully duplicated rows in the merged data — the same 13
# timestamps flagged in the temperature data carry through the join
dfDT[duplicated(dfDT), ]
## # A tibble: 13 × 13
## DATETIME TOTALDEMAND yearValue monthValue dayValue hourValue
## <dttm> <dbl> <dbl> <dbl> <int> <int>
## 1 2011-01-01 00:00:00 8063. 2011 1 1 0
## 2 2011-10-10 10:30:00 9039. 2011 10 10 10
## 3 2011-10-10 18:30:00 9221. 2011 10 10 18
## 4 2011-10-10 19:30:00 9021. 2011 10 10 19
## 5 2012-01-01 00:00:00 7079. 2012 1 1 0
## 6 2013-01-01 00:00:00 7359. 2013 1 1 0
## 7 2014-01-01 00:00:00 7010. 2014 1 1 0
## 8 2015-01-01 00:00:00 7058. 2015 1 1 0
## 9 2016-01-01 00:00:00 7140. 2016 1 1 0
## 10 2017-01-01 00:00:00 7431. 2017 1 1 0
## 11 2018-01-01 00:00:00 7035. 2018 1 1 0
## 12 2019-01-01 00:00:00 7613. 2019 1 1 0
## 13 2020-01-01 00:00:00 7319. 2020 1 1 0
## # ℹ 7 more variables: minuteValue <int>, timeOfDay <dbl>, weekOfMonth <fct>,
## # dayOfWeek <ord>, IsHoliday <dbl>, Date <date>
# Count missing values in the merged data before imputation
countNA(dfDT)
## True
## Total number of NA values: 579
# Impute missing temperatures with cubic-spline interpolation (zoo::na.spline)
# NOTE(review): na.spline can overshoot at the edges of long gaps — confirm
# the imputed values stay within a plausible temperature range
dfDT$TEMPERATURE <- na.spline(dfDT$TEMPERATURE)
# Keep only the variables used for modelling
dfModel <- select(dfDT, DATETIME, TOTALDEMAND, TEMPERATURE,
                  hourValue, yearValue, monthValue, IsHoliday)
head(dfModel)
## # A tibble: 6 × 7
## DATETIME TOTALDEMAND TEMPERATURE hourValue yearValue monthValue
## <dttm> <dbl> <dbl> <int> <dbl> <dbl>
## 1 2010-01-01 00:00:00 8038 23.1 0 2010 1
## 2 2010-01-01 00:30:00 7809. 22.9 0 2010 1
## 3 2010-01-01 01:00:00 7484. 22.6 1 2010 1
## 4 2010-01-01 01:30:00 7117. 22.5 1 2010 1
## 5 2010-01-01 02:00:00 6812. 22.5 2 2010 1
## 6 2010-01-01 02:30:00 6544. 22.4 2 2010 1
## # ℹ 1 more variable: IsHoliday <dbl>
# Confirm no missing values remain after imputation
countNA(dfModel)
## False
## No NA values present.
# Display any fully duplicated rows in the modelling data
dfModel[duplicated(dfModel), ]
## # A tibble: 13 × 7
## DATETIME TOTALDEMAND TEMPERATURE hourValue yearValue monthValue
## <dttm> <dbl> <dbl> <int> <dbl> <dbl>
## 1 2011-01-01 00:00:00 8063. 21 0 2011 1
## 2 2011-10-10 10:30:00 9039. 18.9 10 2011 10
## 3 2011-10-10 18:30:00 9221. 16.1 18 2011 10
## 4 2011-10-10 19:30:00 9021. 15.5 19 2011 10
## 5 2012-01-01 00:00:00 7079. 15.4 0 2012 1
## 6 2013-01-01 00:00:00 7359. 21 0 2013 1
## 7 2014-01-01 00:00:00 7010. 20.4 0 2014 1
## 8 2015-01-01 00:00:00 7058. 20.9 0 2015 1
## 9 2016-01-01 00:00:00 7140. 16.9 0 2016 1
## 10 2017-01-01 00:00:00 7431. 22.6 0 2017 1
## 11 2018-01-01 00:00:00 7035. 22.4 0 2018 1
## 12 2019-01-01 00:00:00 7613. 22.3 0 2019 1
## 13 2020-01-01 00:00:00 7319. 19.4 0 2020 1
## # ℹ 1 more variable: IsHoliday <dbl>
# Drop duplicated rows, keeping the first occurrence of each
dfModel <- distinct(dfModel)
# View the first few rows of the data frame without duplicates
head(dfModel)
## # A tibble: 6 × 7
## DATETIME TOTALDEMAND TEMPERATURE hourValue yearValue monthValue
## <dttm> <dbl> <dbl> <int> <dbl> <dbl>
## 1 2010-01-01 00:00:00 8038 23.1 0 2010 1
## 2 2010-01-01 00:30:00 7809. 22.9 0 2010 1
## 3 2010-01-01 01:00:00 7484. 22.6 1 2010 1
## 4 2010-01-01 01:30:00 7117. 22.5 1 2010 1
## 5 2010-01-01 02:00:00 6812. 22.5 2 2010 1
## 6 2010-01-01 02:30:00 6544. 22.4 2 2010 1
## # ℹ 1 more variable: IsHoliday <dbl>
# Re-check: the de-duplicated data should contain no duplicate rows
dfModel[duplicated(dfModel), ]
## # A tibble: 0 × 7
## # ℹ 7 variables: DATETIME <dttm>, TOTALDEMAND <dbl>, TEMPERATURE <dbl>,
## # hourValue <int>, yearValue <dbl>, monthValue <dbl>, IsHoliday <dbl>
# Inspect the structure of the final modelling data
str(dfModel)
## tibble [196,513 × 7] (S3: tbl_df/tbl/data.frame)
## $ DATETIME : POSIXct[1:196513], format: "2010-01-01 00:00:00" "2010-01-01 00:30:00" ...
## $ TOTALDEMAND: num [1:196513] 8038 7809 7484 7117 6812 ...
## $ TEMPERATURE: num [1:196513] 23.1 22.9 22.6 22.5 22.5 22.4 22.3 22.3 22.1 22.2 ...
## $ hourValue : int [1:196513] 0 0 1 1 2 2 3 3 4 4 ...
## $ yearValue : num [1:196513] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ monthValue : num [1:196513] 1 1 1 1 1 1 1 1 1 1 ...
## $ IsHoliday : num [1:196513] 1 1 1 1 1 1 1 1 1 1 ...
# Define the evaluation window: a 5-year span that ENDS 3 years before the
# last observation (i.e. roughly 8 to 3 years back from the end of the data)
endDate <- max(dfModel$DATETIME) - years(3)
startDate <- endDate - years(5)
# Keep only observations inside that 5-year window
df5Yr <- dfModel %>%
filter(DATETIME >= startDate & DATETIME <= endDate)
# Chronological 70/30 split: the first 70% of rows form the training set
trainSize <- floor(0.7 * nrow(df5Yr))
# Create the training and testing datasets (no shuffling, to avoid leakage)
trainSet <- df5Yr[1:trainSize, ]
testSet <- df5Yr[(trainSize + 1):nrow(df5Yr), ]
# Inspect the test set (the most recent 30% of the 5-year window)
print(testSet)
## # A tibble: 26,295 × 7
## DATETIME TOTALDEMAND TEMPERATURE hourValue yearValue monthValue
## <dttm> <dbl> <dbl> <int> <dbl> <dbl>
## 1 2016-09-16 05:00:00 6228. 13.7 5 2016 9
## 2 2016-09-16 05:30:00 6478. 13.5 5 2016 9
## 3 2016-09-16 06:00:00 6823. 12.7 6 2016 9
## 4 2016-09-16 06:30:00 7473. 13.2 6 2016 9
## 5 2016-09-16 07:00:00 7968. 13.8 7 2016 9
## 6 2016-09-16 07:30:00 8269. 13.4 7 2016 9
## 7 2016-09-16 08:00:00 8461. 13.4 8 2016 9
## 8 2016-09-16 08:30:00 8393. 14.6 8 2016 9
## 9 2016-09-16 09:00:00 8204. 16.2 9 2016 9
## 10 2016-09-16 09:30:00 8083. 17.2 9 2016 9
## # ℹ 26,285 more rows
## # ℹ 1 more variable: IsHoliday <dbl>
# Inspect the training set (the earliest 70% of the 5-year window)
print(trainSet)
## # A tibble: 61,354 × 7
## DATETIME TOTALDEMAND TEMPERATURE hourValue yearValue monthValue
## <dttm> <dbl> <dbl> <int> <dbl> <dbl>
## 1 2013-03-18 00:00:00 6653. 16.4 0 2013 3
## 2 2013-03-18 00:30:00 6544. 15.5 0 2013 3
## 3 2013-03-18 01:00:00 6375. 15.3 1 2013 3
## 4 2013-03-18 01:30:00 6192. 14.9 1 2013 3
## 5 2013-03-18 02:00:00 6027. 14.4 2 2013 3
## 6 2013-03-18 02:30:00 5902. 14 2 2013 3
## 7 2013-03-18 03:00:00 5845. 13.9 3 2013 3
## 8 2013-03-18 03:30:00 5861. 13.8 3 2013 3
## 9 2013-03-18 04:00:00 5929. 13.6 4 2013 3
## 10 2013-03-18 04:30:00 6210. 13.3 4 2013 3
## # ℹ 61,344 more rows
## # ℹ 1 more variable: IsHoliday <dbl>
# Report the size of each split
cat("Number of observations in the training set:", nrow(trainSet), "\n")
## Number of observations in the training set: 61354
cat("Number of observations in the testing set:", nrow(testSet), "\n")
## Number of observations in the testing set: 26295
# Convert the training set to a tsibble indexed by DATETIME and pad any
# missing half-hours with NA demand so the index is regular
tsTrainSet <- trainSet %>%
  as_tsibble(index = DATETIME) %>%
  fill_gaps(TOTALDEMAND = NA)
print(nrow(tsTrainSet))
## [1] 61354
# Convert the test set the same way
tsTestSet <- testSet %>%
  as_tsibble(index = DATETIME) %>%
  fill_gaps(TOTALDEMAND = NA)
print(nrow(tsTestSet))
## [1] 26295
# Model 1: regression with ARIMA errors — first-order differencing
# (pdq(0,1,0)), no seasonal ARIMA terms (PDQ(0,0,0)); daily seasonality is
# captured by 3 Fourier harmonic pairs on the 48-half-hour (daily) period
sarimaD1FrDModel <- tsTrainSet %>%
model(
sarimaD1FrD = ARIMA(TOTALDEMAND ~ pdq(0,1,0) +
PDQ(0,0,0) +
fourier(K=3, period=48))
)
# Model 2: as above, plus weekly seasonality (period = 48 * 7 half-hours)
sarimD1FrDWModel <- tsTrainSet %>%
model(
sarimD1FrDW = ARIMA(TOTALDEMAND ~ pdq(0,1,0) +
PDQ(0,0,0) +
fourier(K=3, period=48) +
fourier(K=3, period=48*7))
)
# Model 3: as above, plus TEMPERATURE as an exogenous regressor
sarimD1FrDWTempModel <- tsTrainSet %>%
model(
sarimD1FrDWTemp = ARIMA(TOTALDEMAND ~ pdq(0,1,0) +
PDQ(0,0,0) +
fourier(K=3, period=48) +
fourier(K=3, period=48*7) +
TEMPERATURE)
)
# Compare fitted models on information criteria (lower AIC/BIC is better).
# Fix: the stray `fit` argument previously passed to glance() referenced an
# undefined object — it was only harmless because ... is evaluated lazily.
sarimaD1FrDModel %>% glance() %>% arrange(BIC) %>% select(.model:BIC)
## # A tibble: 1 × 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 sarimaD1FrD 21032. -392401. 784816. 784816. 784879.
sarimD1FrDWModel %>% glance() %>% arrange(BIC) %>% select(.model:BIC)
## # A tibble: 1 × 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 sarimD1FrDW 20950. -392278. 784581. 784581. 784699.
sarimD1FrDWTempModel %>% glance() %>% arrange(BIC) %>% select(.model:BIC)
## # A tibble: 1 × 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 sarimD1FrDWTemp 20614. -391782. 783591. 783591. 783717.
calFcAcc <- function(fcFable, actualTs) {
  # Report forecast accuracy (MAE and RMSE) of a fable's point forecasts
  # against the actual TOTALDEMAND series, matched on DATETIME.
  #
  # Args:
  #   fcFable : forecast table (or any data frame) with DATETIME and .mean columns
  #   actualTs: data frame / tsibble with DATETIME and TOTALDEMAND columns
  #
  # Returns (invisibly): list(mae = ..., rmse = ...) so callers can reuse the
  # metrics programmatically; previously the values were only printed.
  # Keep only the timestamp and the point forecast
  avgFc <- as.data.frame(fcFable)[, c("DATETIME", ".mean"), drop = FALSE]
  # Inner join (base merge) keeps only timestamps present in both tables
  accData <- merge(avgFc, actualTs, by = "DATETIME")
  err <- accData$.mean - accData$TOTALDEMAND
  # Calculate MAE
  mae <- mean(abs(err), na.rm = TRUE)
  # Calculate RMSE
  rmse <- sqrt(mean(err^2, na.rm = TRUE))
  cat("Mean Absolute Error (MAE):", mae, "\n")
  cat("Root Mean Squared Error (RMSE):", rmse, "\n")
  invisible(list(mae = mae, rmse = rmse))
}
# Forecast over the test horizon with the daily-Fourier model and report accuracy
sarimaD1FrDFc <- forecast(sarimaD1FrDModel, new_data = tsTestSet)
calFcAcc(sarimaD1FrDFc, tsTestSet)
## Mean Absolute Error (MAE): 712.7173
## Root Mean Squared Error (RMSE): 988.6672
# Forecast with the daily + weekly Fourier model and report accuracy
sarimD1FrDWFc <- forecast(sarimD1FrDWModel, new_data = tsTestSet)
calFcAcc(sarimD1FrDWFc, tsTestSet)
## Mean Absolute Error (MAE): 725.3482
## Root Mean Squared Error (RMSE): 1034.277
# Forecast with the daily + weekly Fourier + temperature model and report accuracy
sarimD1FrDWTempFc <- forecast(sarimD1FrDWTempModel, new_data = tsTestSet)
calFcAcc(sarimD1FrDWTempFc, tsTestSet)
## Mean Absolute Error (MAE): 750.4085
## Root Mean Squared Error (RMSE): 1076.07
fableTb <- function(fable) {
  # Reduce a fable to a plain tibble containing just the timestamp and
  # the mean (point) forecast.
  select(as_tibble(fable), DATETIME, .mean)
}
head(tsTestSet)
## # A tsibble: 6 x 7 [30m] <Australia/Brisbane>
## DATETIME TOTALDEMAND TEMPERATURE hourValue yearValue monthValue
## <dttm> <dbl> <dbl> <int> <dbl> <dbl>
## 1 2016-09-16 05:00:00 6228. 13.7 5 2016 9
## 2 2016-09-16 05:30:00 6478. 13.5 5 2016 9
## 3 2016-09-16 06:00:00 6823. 12.7 6 2016 9
## 4 2016-09-16 06:30:00 7473. 13.2 6 2016 9
## 5 2016-09-16 07:00:00 7968. 13.8 7 2016 9
## 6 2016-09-16 07:30:00 8269. 13.4 7 2016 9
## # ℹ 1 more variable: IsHoliday <dbl>
# Collapse each fable to a tibble, giving the point forecast a model-specific name
tbSarimaD1FrDFc <- rename(fableTb(sarimaD1FrDFc), sarimaD1FrD_mean = .mean)
tbSarimD1FrDWFc <- rename(fableTb(sarimD1FrDWFc), sarimD1FrDW_mean = .mean)
tbSarimD1FrDWTempFc <- rename(fableTb(sarimD1FrDWTempFc), sarimD1FrDWTemp_mean = .mean)
# Combine all forecasts with the actuals, matched on DATETIME
joinData <- Reduce(function(lhs, rhs) full_join(lhs, rhs, by = "DATETIME"),
                   list(tbSarimaD1FrDFc, tbSarimD1FrDWFc, tbSarimD1FrDWTempFc, tsTestSet))
# Overlay loess-smoothed actual demand and the three model forecasts.
# NOTE(review): geom_smooth() heavily smooths the half-hourly series, so this
# compares broad trends rather than point-by-point accuracy.
# Fix: renamed the object from `plot` to avoid masking base::plot().
fcPlot <- ggplot(joinData, aes(x = DATETIME)) +
  geom_smooth(aes(y = TOTALDEMAND, color = "Actual Demand"), method = "loess", se = FALSE, linewidth = 1) +
  geom_smooth(aes(y = sarimaD1FrD_mean, color = "SarimaD1FrD Forecast"), method = "loess", se = FALSE, linewidth = 1) +
  geom_smooth(aes(y = sarimD1FrDW_mean, color = "SarimD1FrDW Forecast"), method = "loess", se = FALSE, linewidth = 1) +
  geom_smooth(aes(y = sarimD1FrDWTemp_mean, color = "SarimD1FrDWTemp Forecast"), method = "loess", se = FALSE, linewidth = 1) +
  labs(title = "Comparison of Actual Demand vs Forecasts",
       x = "DateTime",
       y = "Demand",
       color = "Legend") +
  theme_minimal() +
  scale_color_manual(values = c("Actual Demand" = "black",
                                "SarimaD1FrD Forecast" = "blue",
                                "SarimD1FrDW Forecast" = "red",
                                "SarimD1FrDWTemp Forecast" = "green"))
# Print the plot
print(fcPlot)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'